Applying our zero-inflated model to the olfactory data.
We select only the cells that pass QC, color coded by the experimental time point. I will focus on the top 1,000 most variable genes. Note that the data are not public, hence it should not work other than on Davide’s computer.
load("Expt4c2b_filtdata.Rda")
load("E4c2b_slingshot_wsforkelly.RData")
To speed up the computations, I will focus on the top 1,000 most variable genes.
names(batch) <- colnames(counts)
counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]
vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000],]
First, let’s look at PCA (of the log counts) for reference.
par(mfrow=c(1, 2))
detection_rate <- colSums(core>0)
coverage <- colSums(core)
colMerged <- cc_rev[clus.labels]
pca <- prcomp(t(log1p(core)))
plot(pca$x, col=colMerged, pch=19, main="PCA of log-counts, centered not scaled")
fq <- EDASeq::betweenLaneNormalization(counts, which="full")
pcafq <- prcomp(t(log1p(fq)))
plot(pcafq$x, col=colMerged, pch=19, main="PCA of FQ log-counts, centered not scaled")
The first plot is the unnormalized data and the second plot is after full-quantile normalization, which is what Russell and Diya used for the paper.
They found that to fully explain the differences between clusters, we need three dimensions.
pairs(pcafq$x[,1:3], col=colMerged, pch=19, main="PCA of FQ log-counts, centered not scaled")
df <- data.frame(PC1=pcafq$x[,1], PC2=pcafq$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## PC1 PC2 detection_rate coverage
## PC1 1.00000000 -0.0432585 0.2643194 -0.02214266
## PC2 -0.04325850 1.0000000 0.6508453 0.19979644
## detection_rate 0.26431940 0.6508453 1.0000000 0.41994811
## coverage -0.02214266 0.1997964 0.4199481 1.00000000
Even after full-quantile normalization, there is some residual correlation between PC2 and detection rate.
print(system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 3)))
## user system elapsed
## 221.201 37.048 108.063
pairs(zinb_Vall@W, col=colMerged, pch=19, main="ZINB")
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], W3=zinb_Vall@W[,3], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage, quality=qcpca$x[,1])
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 W3 gamma_mu
## W1 1.00000000 -0.16601378 -0.104938063 0.03575507
## W2 -0.16601378 1.00000000 0.040478697 0.01093061
## W3 -0.10493806 0.04047870 1.000000000 -0.17209802
## gamma_mu 0.03575507 0.01093061 -0.172098021 1.00000000
## gamma_pi 0.26329101 0.05547144 -0.018388095 -0.09512231
## detection_rate -0.34471857 -0.13187287 -0.023141527 0.22821064
## coverage 0.08169276 -0.04792555 0.008233665 0.90198032
## quality 0.10834219 0.08748443 -0.006226189 -0.55756193
## gamma_pi detection_rate coverage quality
## W1 0.26329101 -0.34471857 0.081692763 0.108342193
## W2 0.05547144 -0.13187287 -0.047925554 0.087484425
## W3 -0.01838809 -0.02314153 0.008233665 -0.006226189
## gamma_mu -0.09512231 0.22821064 0.901980318 -0.557561930
## gamma_pi 1.00000000 -0.96579515 -0.318289834 0.298937557
## detection_rate -0.96579515 1.00000000 0.419948109 -0.347477558
## coverage -0.31828983 0.41994811 1.000000000 -0.609447350
## quality 0.29893756 -0.34747756 -0.609447350 1.000000000
mod <- model.matrix(~qcpca$x[,1:5])
print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3, X=mod)))
## user system elapsed
## 380.032 40.190 172.770
pairs(zinb_3@W, col=colMerged, pch=19, main="ZINB")
#total number of detected genes in the cell
df <- data.frame(W1=zinb_3@W[,1], W2=zinb_3@W[,2], W3=zinb_3@W[,3], gamma_mu = zinb_3@gamma_mu[1,], gamma_pi = zinb_3@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 W3 gamma_mu
## W1 1.000000000 0.008567876 0.130462367 0.09893941
## W2 0.008567876 1.000000000 0.001914656 0.11134599
## W3 0.130462367 0.001914656 1.000000000 -0.03719696
## gamma_mu 0.098939405 0.111345989 -0.037196957 1.00000000
## gamma_pi -0.072167019 0.058346532 -0.129945032 -0.28471602
## detection_rate 0.141119099 -0.130270900 0.153378775 0.48856596
## coverage 0.024521464 0.121797564 0.106339543 0.86995162
## gamma_pi detection_rate coverage
## W1 -0.07216702 0.1411191 0.02452146
## W2 0.05834653 -0.1302709 0.12179756
## W3 -0.12994503 0.1533788 0.10633954
## gamma_mu -0.28471602 0.4885660 0.86995162
## gamma_pi 1.00000000 -0.9405971 -0.25089634
## detection_rate -0.94059713 1.0000000 0.41994811
## coverage -0.25089634 0.4199481 1.00000000
print(system.time(zinb_4 <- zinbFit(core, ncores = 3, K = 4)))
## user system elapsed
## 252.835 28.154 116.245
pairs(zinb_4@W, col=colMerged, pch=19, main="ZINB")